import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import plotly.express as px
資料科學是一門將資料(Data)提煉為資訊(Information)的學科,提煉過程中可能包含資料載入、資料操作、探索性分析以及監督式學習等。
只有文字所構成的電腦檔案,不包含字型的樣式或者段落標記,能夠使用最簡單的文字編輯器(例如 Windows 的「記事本」、macOS 的 TextEdit)直接開啟檢視。
常見的分隔符號包含逗號(,)、分號(;)、Tab 鍵(\t)等。.csv 意指逗號分隔值(Comma-separated values)。
使用 pd.read_csv() 函數載入純文字檔案
- 以 import pandas as pd 載入 Pandas。
- 以 df.head() 檢視前五列。
data_url = "https://raw.githubusercontent.com/datainpoint/classroom-data-visualization/main/data/daily_report.csv"
# Download the CSV that `data_url` points at and parse it into a DataFrame.
daily_report = pd.read_csv(filepath_or_buffer=data_url)
# Preview the first five rows as the cell result.
daily_report.head(n=5)
| | Combined_Key | Last_Update | Confirmed | Deaths |
|---|---|---|---|---|
| 0 | Afghanistan | 2022-04-21 04:20:46 | 178574 | 7680 |
| 1 | Albania | 2022-04-21 04:20:46 | 274606 | 3496 |
| 2 | Algeria | 2022-04-21 04:20:46 | 265746 | 6874 |
| 3 | Andorra | 2022-04-21 04:20:46 | 41013 | 153 |
| 4 | Angola | 2022-04-21 04:20:46 | 99287 | 1900 |
# Location of the lookup table CSV (rebinds `data_url` from the previous load).
data_url = "https://raw.githubusercontent.com/datainpoint/classroom-data-visualization/main/data/lookup_table.csv"
# Download and parse the lookup table into a DataFrame.
lookup_table = pd.read_csv(filepath_or_buffer=data_url)
# Preview the first five rows as the cell result.
lookup_table.head(n=5)
| | UID | Combined_Key | iso2 | iso3 | Country_Region | Province_State | Admin2 | Lat | Long_ | Population |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Afghanistan | AF | AFG | Afghanistan | NaN | NaN | 33.93911 | 67.709953 | 38928341.0 |
| 1 | 8 | Albania | AL | ALB | Albania | NaN | NaN | 41.15330 | 20.168300 | 2877800.0 |
| 2 | 10 | Antarctica | AQ | ATA | Antarctica | NaN | NaN | -71.94990 | 23.347000 | NaN |
| 3 | 12 | Algeria | DZ | DZA | Algeria | NaN | NaN | 28.03390 | 1.659600 | 43851043.0 |
| 4 | 20 | Andorra | AD | AND | Andorra | NaN | NaN | 42.50630 | 1.521800 | 77265.0 |
# Location of the time-series CSV (rebinds `data_url` once more).
data_url = "https://raw.githubusercontent.com/datainpoint/classroom-data-visualization/main/data/time_series.csv"
# Download and parse the time series into a DataFrame.
# No date parsing is requested, so the Date column keeps whatever dtype
# read_csv infers for it.
time_series = pd.read_csv(filepath_or_buffer=data_url)
# Preview the first five rows as the cell result.
time_series.head(n=5)
| | Date | Country_Region | Confirmed | Deaths | Daily_Cases | Daily_Deaths |
|---|---|---|---|---|---|---|
| 0 | 2020-01-22 | Afghanistan | 0 | 0 | 0 | 0 |
| 1 | 2020-01-22 | Albania | 0 | 0 | 0 | 0 |
| 2 | 2020-01-22 | Algeria | 0 | 0 | 0 | 0 |
| 3 | 2020-01-22 | Andorra | 0 | 0 | 0 | 0 |
| 4 | 2020-01-22 | Angola | 0 | 0 | 0 | 0 |
- pd.read_excel()
- pd.read_sql()

When it comes to making graphs, half the battle occurs before you call any graphing commands.
# Countries compared in the bar and line charts below.
country_list = [
    "US",
    "United Kingdom",
    "France",
    "Germany",
    "Canada",
    "Korea, South",
    "Japan",
    "Singapore",
    "Australia",
    "Taiwan",
    "New Zealand",
]

# Attach the lookup-table columns to every daily-report row that has a
# matching Combined_Key (merge defaults to an inner join).
daily_report_merge_lookup_table = daily_report.merge(lookup_table, on="Combined_Key")

# Keep only the rows belonging to the selected countries.
is_selected_country = daily_report_merge_lookup_table["Country_Region"].isin(country_list)
filtered_daily_report_merge_lookup_table = daily_report_merge_lookup_table[is_selected_country]

# Total confirmed count per country, largest first.
confirmed_per_country = filtered_daily_report_merge_lookup_table.groupby("Country_Region")["Confirmed"].sum()
data_for_bar = confirmed_per_country.sort_values(ascending=False)
data_for_bar
Country_Region
US                80801162
France            28162002
Germany           23844536
United Kingdom    22060704
Korea, South      16674045
Japan              7484263
Australia          5563493
Canada             3669173
Singapore          1170970
New Zealand         865522
Taiwan               40186
Name: Confirmed, dtype: int64
# Restrict the time series to the selected countries, then renumber the rows
# from 0 so the filtered frame has a clean positional index.
in_country_list = time_series["Country_Region"].isin(country_list)
data_for_line = time_series.loc[in_country_list].reset_index(drop=True)
data_for_line
| | Date | Country_Region | Confirmed | Deaths | Daily_Cases | Daily_Deaths |
|---|---|---|---|---|---|---|
| 0 | 2020-01-22 | Australia | 0 | 0 | 0 | 0 |
| 1 | 2020-01-22 | Canada | 0 | 0 | 0 | 0 |
| 2 | 2020-01-22 | France | 0 | 0 | 0 | 0 |
| 3 | 2020-01-22 | Germany | 0 | 0 | 0 | 0 |
| 4 | 2020-01-22 | Japan | 2 | 0 | 2 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 9015 | 2022-04-20 | New Zealand | 865522 | 600 | 10574 | 18 |
| 9016 | 2022-04-20 | Singapore | 1170970 | 1319 | 3472 | 2 |
| 9017 | 2022-04-20 | Taiwan | 40186 | 856 | 2476 | 2 |
| 9018 | 2022-04-20 | US | 80801713 | 990208 | 68781 | 877 |
| 9019 | 2022-04-20 | United Kingdom | 22060704 | 173012 | 27321 | 514 |
9020 rows × 6 columns
# Sum Confirmed for every distinct location (name, region hierarchy and
# coordinates). groupby drops rows whose key columns contain NaN by default,
# so only rows with every key populated (including Province_State and Admin2)
# remain.
geo_keys = ["Combined_Key", "Country_Region", "Province_State", "Admin2", "Lat", "Long_"]
confirmed_by_lat_long = daily_report_merge_lookup_table.groupby(geo_keys)["Confirmed"].sum()

# Turn the grouped keys back into ordinary columns next to the Confirmed total.
data_for_geo = confirmed_by_lat_long.reset_index()
data_for_geo
| | Combined_Key | Country_Region | Province_State | Admin2 | Lat | Long_ | Confirmed |
|---|---|---|---|---|---|---|---|
| 0 | Abbeville, South Carolina, US | US | South Carolina | Abbeville | 34.223334 | -82.461707 | 6644 |
| 1 | Acadia, Louisiana, US | US | Louisiana | Acadia | 30.295065 | -92.414197 | 15039 |
| 2 | Accomack, Virginia, US | US | Virginia | Accomack | 37.767072 | -75.632346 | 6957 |
| 3 | Ada, Idaho, US | US | Idaho | Ada | 43.452658 | -116.241552 | 134084 |
| 4 | Adair, Iowa, US | US | Iowa | Adair | 41.330756 | -94.471059 | 1583 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3199 | Yuma, Arizona, US | US | Arizona | Yuma | 32.768957 | -113.906667 | 62659 |
| 3200 | Yuma, Colorado, US | US | Colorado | Yuma | 40.003468 | -102.425867 | 1883 |
| 3201 | Zapata, Texas, US | US | Texas | Zapata | 27.001564 | -99.169872 | 3683 |
| 3202 | Zavala, Texas, US | US | Texas | Zavala | 28.866172 | -99.760508 | 4049 |
| 3203 | Ziebach, South Dakota, US | US | South Dakota | Ziebach | 44.978819 | -101.665462 | 663 |
3204 rows × 7 columns
以 import matplotlib.pyplot as plt 載入。
# Plotting with Matplotlib
# Horizontal bar chart of confirmed totals per country.
# Sort ascending once: barh draws the first entry at the bottom, so the
# largest total ends up as the top bar.
ascending_confirmed = data_for_bar.sort_values()

fig, ax = plt.subplots()
ax.barh(ascending_confirmed.index, ascending_confirmed.values)
ax.set(title="Confirmed By Country", xlabel="Number of Confirmed")
plt.show()
# Plotting with Seaborn
# One line per country: cumulative Confirmed over Date.
# sns.lineplot returns the Matplotlib Axes it drew on; that Axes is what the
# name `fig` is bound to here.
fig = sns.lineplot(
    data=data_for_line,
    x="Date",
    y="Confirmed",
    hue="Country_Region",
)
# Hide the x tick marks and labels entirely.
# NOTE(review): Date was loaded without date parsing, so it is presumably a
# string column with one tick per distinct date -- confirm before re-enabling
# the ticks.
fig.set(xticks=[])
/var/folders/0b/r__z5mpn6ldgb_w2j7_y_ntr0000gn/T/ipykernel_55764/599519132.py:3: MatplotlibDeprecationWarning: Support for passing numbers through unit converters is deprecated since 3.5 and support will be removed two minor releases later; use Axis.convert_units instead. fig.set(xticks=[])
[[]]
# Plotting with Folium
# GeoJSON with the US state outlines, taken from the folium example data.
url = "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
state_geo = f"{url}/us-states.json"

# data_for_geo holds one row per Combined_Key (e.g. "Abbeville, South
# Carolina, US"), so each Province_State value repeats many times. Choropleth
# builds a key -> value mapping from the two columns, which leaves a single
# arbitrary row per state instead of the state total. Aggregate to exactly one
# row per state first so each polygon is coloured by its summed Confirmed count.
us_rows = data_for_geo[data_for_geo["Country_Region"] == "US"]
confirmed_by_state = us_rows.groupby("Province_State", as_index=False)["Confirmed"].sum()

# Base map centred so the United States is in view.
m = folium.Map(location=[48, -102], zoom_start=3)
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=confirmed_by_state,
    columns=["Province_State", "Confirmed"],
    # Match each state's name in the GeoJSON against Province_State.
    key_on="feature.properties.name",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Confirmed",
).add_to(m)
# Layer toggle control; add_to returns the control, which is the cell result.
folium.LayerControl().add_to(m)
<folium.map.LayerControl at 0x7fe2c46f0610>
# Evaluate the map object as the last expression so a notebook front end
# renders it as the cell output.
m
# Plotting with Plotly
# Plotly Express wants a DataFrame with named columns, so move the
# Country_Region index of the Series into a regular column first.
bar_frame = data_for_bar.reset_index()
fig = px.bar(
    bar_frame,
    x="Confirmed",
    y="Country_Region",
    color="Country_Region",
)
fig.show()
# Plotting with Plotly
# One spline-smoothed line per country showing Confirmed over Date, rendered
# as SVG, with the country name as the hover title.
fig = px.line(
    data_for_line,
    x="Date",
    y="Confirmed",
    color="Country_Region",
    line_group="Country_Region",
    hover_name="Country_Region",
    line_shape="spline",
    render_mode="svg",
)
fig.show()
# Plotting with Plotly
# Bubble map: one marker per location in data_for_geo, with both marker size
# and colour driven by Confirmed and the location name shown on hover.
fig = px.scatter_mapbox(
    data_for_geo,
    lat="Lat",
    lon="Long_",
    size="Confirmed",
    color="Confirmed",
    size_max=50,
    mapbox_style="carto-positron",
    zoom=3,
    hover_name="Combined_Key",
)
fig.show()